Nbastats <- read.csv(("C://Users/Tripl/OneDrive/Documents/R/nba.games.stats.csv"), stringsAsFactors = FALSE)
library(prettydoc)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(animation)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gganimate)
library(wesanderson)
Nbastats<- Nbastats %>%
mutate(Date = lubridate::as_date(Date),
season = ifelse(Date > "2014-10-01" & Date < "2015-06-30", "2014-2015",
ifelse(Date > "2015-10-01" & Date < "2016-06-30", "2015-2016",
ifelse(Date > "2016-10-01" & Date < "2017-06-30", "2016-2017", "2017-2018"))),
Win01= ifelse (WINorLOSS=="W", 1, 0))
Here I created a season column so as to help later in my analysis in sifting through season data. Due to changeovers in roster or coaches, individual seasons should be established for data integrity to be maintained. I also created a win loss column in order to later establish how many of the 82 games in a given season each individual team actually won.
Nbastats <- Nbastats %>%
mutate(EFGperc = (FieldGoals + (.5*X3PointShots))/FieldGoalsAttempted)
Effective field goal percentage is a statistic that bears more relevance then the regular field goal percentage because it accounts for the fact that a made three is worth more than a two point field-goal, despite that regular field goal percentage just weighs them all the same as “shots.”
TeamSeasonEFG <- Nbastats %>%
mutate(EFGperc = (FieldGoals + (.5*X3PointShots))/FieldGoalsAttempted) %>%
group_by(Team, season) %>%
summarize(meanEFG = mean(EFGperc), WinCount = sum(Win01), THREEPOINTSHOTSPERGAME = mean(X3PointShotsAttempted), OppPtsPerGame = mean(OpponentPoints), PointsPerGame = mean(TeamPoints), SPG = mean(Steals), Opp3PtShotsAttempted = mean(Opp.3PointShotsAttempted), OppTPG = mean(Opp.Turnovers), TPG = mean(Turnovers), APG = mean(Assists))
Here, I created a summary table of the data sorting by team and season in order to have the teams Effective FG% for the season and their wins in each season. I also included a bunch of other mean statistics for future exploration.
WinCountPlot <- ggplot(data=TeamSeasonEFG, aes(TeamSeasonEFG$WinCount)) + geom_bar(fill="black", color = "white") + theme_light()
WinCountPlot + scale_fill_gradient(low="blue", high="red")
WinCountScaled <- ggplot()+ geom_bar(data = TeamSeasonEFG[TeamSeasonEFG$WinCount < 30,], mapping = aes(WinCount), fill = "red", color="black")
WinCountColored <- WinCountScaled + geom_bar(data = TeamSeasonEFG[TeamSeasonEFG$WinCount > 49,], mapping = aes(WinCount), fill = "gold", color="black")+ geom_bar(data = TeamSeasonEFG[TeamSeasonEFG$WinCount > 29 & TeamSeasonEFG$WinCount < 50 ,], mapping = aes(WinCount), fill = "black", color="red") + annotate("text", x =19 , y = 6, label = "Pray For Luka or Zion") + annotate("text", x =40 , y = 7, label = "NBA Purgatory") + annotate("text", x =64 , y = 6, label = "There Can Only Be One") + ggtitle("An Examination Of Win Count")
WinCountColored
WinCountColoredFacet <- WinCountScaled + geom_bar(data = TeamSeasonEFG[TeamSeasonEFG$WinCount > 49,], mapping = aes(WinCount), fill = "gold", color="black")+ geom_bar(data = TeamSeasonEFG[TeamSeasonEFG$WinCount > 29 & TeamSeasonEFG$WinCount < 50 ,], mapping = aes(WinCount), fill = "black", color = "red") + annotate("text", x =20 , y = 5, label = "Bottom Feeders") + annotate("text", x =40 , y = 7, label = "Purgatory") + annotate("text", x =64 , y = 5, label = "Title Threats") + facet_wrap(~season) + ggtitle("An Examination Of Win Count By Season")
WinCountColoredFacet
Analysis: The above histograms show a count of win counts, colored by where the team lies in terms of practical ability to win the NBA championship. 50 is by no means a “magic number” but it is a pretty good indicator of the type of team that typically has demonstrated the ability to win the NBA championship. In breaking the historgram up by season, the insight gained is telling. As teams have begun to dive deeper and depper into the analytics of basketball, exploiting every single possible rule within the system has become more and more common. Thus, it is no surprise that 2017-2018 saw a significant increase in “bottom feeder” teams from 2014-2015. In addition to the analytics explosion, the relative sucess of the 76ers tanking experiment “The Process” could have factored into more ownership groups being willing to go all in on the full blown awful.
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
WinCorrWithNAREMOVAL<- cor(TeamSeasonEFG[, 3:9], use = "pairwise.complete.obs")
knitr::kable(WinCorrWithNAREMOVAL)
| meanEFG | WinCount | THREEPOINTSHOTSPERGAME | OppPtsPerGame | PointsPerGame | SPG | Opp3PtShotsAttempted | |
|---|---|---|---|---|---|---|---|
| meanEFG | 1.0000000 | 0.6865368 | 0.5667305 | 0.1561670 | 0.8305779 | 0.2205922 | 0.3504467 |
| WinCount | 0.6865368 | 1.0000000 | 0.2644350 | -0.4404047 | 0.5580927 | 0.2186950 | -0.1177219 |
| THREEPOINTSHOTSPERGAME | 0.5667305 | 0.2644350 | 1.0000000 | 0.3615865 | 0.6089860 | 0.0660815 | 0.3997115 |
| OppPtsPerGame | 0.1561670 | -0.4404047 | 0.3615865 | 1.0000000 | 0.4679852 | -0.0181805 | 0.6563483 |
| PointsPerGame | 0.8305779 | 0.5580927 | 0.6089860 | 0.4679852 | 1.0000000 | 0.1854090 | 0.4583397 |
| SPG | 0.2205922 | 0.2186950 | 0.0660815 | -0.0181805 | 0.1854090 | 1.0000000 | 0.1359623 |
| Opp3PtShotsAttempted | 0.3504467 | -0.1177219 | 0.3997115 | 0.6563483 | 0.4583397 | 0.1359623 | 1.0000000 |
WinCountCorPlot <- corrplot::corrplot(WinCorrWithNAREMOVAL, order = "hclust")
WinCountCorPlot
## OppPtsPerGame Opp3PtShotsAttempted SPG
## OppPtsPerGame 1.00000000 0.6563483 -0.01818052
## Opp3PtShotsAttempted 0.65634826 1.0000000 0.13596232
## SPG -0.01818052 0.1359623 1.00000000
## WinCount -0.44040468 -0.1177219 0.21869500
## THREEPOINTSHOTSPERGAME 0.36158654 0.3997115 0.06608148
## meanEFG 0.15616700 0.3504467 0.22059216
## PointsPerGame 0.46798518 0.4583397 0.18540902
## WinCount THREEPOINTSHOTSPERGAME meanEFG
## OppPtsPerGame -0.4404047 0.36158654 0.1561670
## Opp3PtShotsAttempted -0.1177219 0.39971154 0.3504467
## SPG 0.2186950 0.06608148 0.2205922
## WinCount 1.0000000 0.26443500 0.6865368
## THREEPOINTSHOTSPERGAME 0.2644350 1.00000000 0.5667305
## meanEFG 0.6865368 0.56673055 1.0000000
## PointsPerGame 0.5580927 0.60898596 0.8305779
## PointsPerGame
## OppPtsPerGame 0.4679852
## Opp3PtShotsAttempted 0.4583397
## SPG 0.1854090
## WinCount 0.5580927
## THREEPOINTSHOTSPERGAME 0.6089860
## meanEFG 0.8305779
## PointsPerGame 1.0000000
Analysis:
In unsurprising fashion, numerous worthwhile correlations were are to be found in this matrix. Specifically looking at mean EFG, it showes strong correlation to win count, points per game, and three point shots per game.
If a team has more assists over the course of a season, than they will have a higher win count.
WLBoxplot <- ggplot(Nbastats, aes(WINorLOSS, Assists)) +
geom_boxplot(fill = "blue", color = "red") +
theme_minimal()
WLBoxplot
If a team gets more steals, then its opponent will score less.
Assignment3lmtest <- lm(OppPtsPerGame ~ SPG, data = TeamSeasonEFG)
summary(Assignment3lmtest)
##
## Call:
## lm(formula = OppPtsPerGame ~ SPG, data = TeamSeasonEFG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7027 -3.0317 0.2758 3.0891 9.7196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104.37587 3.68412 28.331 <2e-16 ***
## SPG -0.09335 0.47261 -0.198 0.844
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.316 on 118 degrees of freedom
## Multiple R-squared: 0.0003305, Adjusted R-squared: -0.008141
## F-statistic: 0.03902 on 1 and 118 DF, p-value: 0.8438
Analysis: I fail to reject the null hypothesis. The p-value was not statistically significant in any way.
OppPtsVsSteals <- ggplot(TeamSeasonEFG, aes(SPG, OppPtsPerGame, col = season)) + geom_smooth(method = "lm", se = FALSE) + theme_classic()+ ggtitle("2014-2018 Analysis: Mean Points Allowed Per Game vs. Mean Steals Per Game")
OppPtsVsSteals
Analysis: The above visual actually shows what I had already discussed in regard to rejecting the null hypothesis. Only 2017-2018 saw a steady decrease in opponents points per game as a team got more steals per game.
OppPtsVsStealsByseason <- ggplot(TeamSeasonEFG, aes(SPG, OppPtsPerGame, col = WinCount)) + geom_point(size=3)+ geom_smooth(method = "lm", se = FALSE, color = "red")+ theme_classic()+ facet_wrap(~season) + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8)) + scale_color_gradient(low="green", high= "purple")+ ggtitle("Granular Season Over Season: Mean Points Allowed Per Game vs. Mean Steals Per Game")
OppPtsVsStealsByseason
Analysis:
Broken out over season to include win count, there is no statistically significant relationship between these variables when they interact with each other.
library(lme4)
## Loading required package: Matrix
randint <- lmer(OppPtsPerGame ~ SPG + (1|season), data = TeamSeasonEFG)
summary(randint)
## Linear mixed model fit by REML ['lmerMod']
## Formula: OppPtsPerGame ~ SPG + (1 | season)
## Data: TeamSeasonEFG
##
## REML criterion at convergence: 651.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.76828 -0.65967 0.02114 0.64232 2.21301
##
## Random effects:
## Groups Name Variance Std.Dev.
## season (Intercept) 7.96 2.821
## Residual 12.56 3.544
## Number of obs: 120, groups: season, 4
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 103.77582 3.34407 31.033
## SPG -0.01593 0.38896 -0.041
##
## Correlation of Fixed Effects:
## (Intr)
## SPG -0.901
ranef(randint)
## $season
## (Intercept)
## 2014-2015 -3.4565071
## 2015-2016 -0.9303196
## 2016-2017 1.8407279
## 2017-2018 2.5460987
##
## with conditional variances for "season"
library(merTools)
## Loading required package: arm
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
## arm (Version 1.10-1, built: 2018-4-12)
## Working directory is C:/Users/Tripl/OneDrive/Documents/R/Advanced Statistical Inference With Seth
##
## Attaching package: 'arm'
## The following object is masked from 'package:corrplot':
##
## corrplot
plotREsim(REsim(randint), labs = TRUE)
confint(randint)
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 1.2713629 6.1132495
## .sigma 3.1193212 4.0367741
## (Intercept) 97.3285616 110.2321421
## SPG -0.7828262 0.7479671
I found this breakdown to be fascinating. In the linear model visualization, you can clearly see the evolution of basketball. In 2014-2015, opponents points per game went up as a team got more steals. This is incredibly counter-intuitive and causes some inferential questions.
ThirtyTeamSplit <- ggplot(TeamSeasonEFG, aes(season, meanEFG, group=Team)) + geom_line()+
geom_point(aes(color=WinCount)) +
scale_color_gradient(low="green", high= "blue")+
facet_wrap(~Team) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8)) +
ggtitle("Mean EFG vs. Win Total Over Time")
ThirtyTeamSplit
The above visual is a facet wrap of all 30 NBA teams over the course of 4 seasons. It shows the rise or fall in mean effective field goal % and is colored to highlight win count. Most notable here would be the rise of the Philadelphia 76ers. This rise in both variables correlates nicely with “The Process” and the pivot from tanking team to on the rise playoff contender. It also highlights the unparalelled excellence of the Golden State Warriors.
MovingWinTotalEFGNBAVIZ <- TeamSeasonEFG %>%
plot_ly(
x = ~meanEFG,
y = ~WinCount,
color = ~Team,
frame = ~season,
text = ~Team,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
) %>%
layout(
xaxis = list(
type = "log"
)
)
MovingWinTotalEFGNBAVIZ
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Similarly, this is a moving interactive graphic that lets you evaluate the team performance in EFG% and Win Total over the four seasons.
SizingWithThrees<- TeamSeasonEFG %>%
plot_ly(
x = ~meanEFG,
y = ~THREEPOINTSHOTSPERGAME,
size = ~WinCount,
color = ~Team,
frame = ~season,
text = ~Team,
hoverinfo = "text",
type = 'scatter',
mode = 'markers'
) %>%
layout(
xaxis = list(
type = "log"
)
)
SizingWithThrees
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Here is a comparison of Mean EFG to 3 Point Shots Attempted, sizing the dots by win count in each season. The visual very clearly highlights the change in NBA play style. As teams began to embrace analytics, more and more teams began to see the value in shooting many more threes per game. As that went up, so too did meanEFG (a very good indicator of offensive efficiency being on the rise).
A2TO <- lm(TPG ~ APG, data = TeamSeasonEFG)
summary(A2TO)
##
## Call:
## lm(formula = TPG ~ APG, data = TeamSeasonEFG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7381 -0.7603 -0.1902 0.5924 3.5112
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.88054 1.15445 9.425 4.61e-16 ***
## APG 0.12233 0.05099 2.399 0.018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.161 on 118 degrees of freedom
## Multiple R-squared: 0.04651, Adjusted R-squared: 0.03843
## F-statistic: 5.756 on 1 and 118 DF, p-value: 0.018
A2TO30TM <- ggplot(TeamSeasonEFG, aes(season, APG, group=Team)) + geom_line()+
geom_point(aes(color=TPG, size = WinCount)) +
scale_color_gradient(low="red", high= "blue")+
facet_wrap(~Team) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=7)) +
ggtitle("APG vs. TPG")
A2TO30TM
AssistsVsTO <- TeamSeasonEFG %>%
plot_ly(
x = ~APG,
y = ~TPG,
size = ~WinCount,
color = ~Team,
frame = ~season,
text = ~Team,
hoverinfo = "size",
type = 'scatter',
mode = 'markers'
) %>%
layout(
xaxis = list(
type = "lm"
)
)
AssistsVsTO
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors